This notebook is all about benchmarking some R code used in this package.

Hardware / Software used:

  • Intel i7-4600U
  • Compilation flags for C/C++: -O2 -Wall $(DEBUGFLAG) -mtune=core2 (R’s defaults)
  • Windows Server 2012 R2
  • R 3.3.2 + Intel MKL

Libraries

library(data.table)
data.table 1.10.4
  The fastest way to learn (by data.table authors): https://www.datacamp.com/courses/data-analysis-the-data-table-way
  Documentation: ?data.table, example(data.table) and browseVignettes("data.table")
  Release notes, videos and slides: http://r-datatable.com
library(microbenchmark)
library(Rcpp)
library(ggplot2)
library(plotly)

Attaching package: <U+393C><U+3E31>plotly<U+393C><U+3E32>

The following object is masked from <U+393C><U+3E31>package:ggplot2<U+393C><U+3E32>:

    last_plot

The following object is masked from <U+393C><U+3E31>package:stats<U+393C><U+3E32>:

    filter

The following object is masked from <U+393C><U+3E31>package:graphics<U+393C><U+3E32>:

    layout
# Helper function to print data well in tables
print_well <- function(data, digits = 6) {
  
  # To milliseconds
  data <- data / 1000000
  
  # Sprintf helper
  sprintf_helper <- paste0("%.0", digits, "f")
  
  cat("| Min | 25% | 50% | 75% | Max | Mean |  \n| --: | --: | --: | --: | --: | --: |  \n| ", sprintf(sprintf_helper, min(data)), " | ", sprintf(sprintf_helper, quantile(data, probs = 0.25)), " | ", sprintf(sprintf_helper, median(data)), " | ", sprintf(sprintf_helper, quantile(data, probs = 0.75)), " | ", sprintf(sprintf_helper, max(data)), " | ", sprintf(sprintf_helper, mean(data)), " |  \n", sep = "")
  
  return(data)
  
}
# Test case function
# Arguments renamed to avoid recursive clash
test_case <- function(f, preds, labels, eps) {
  cat("Test case: ", paste(do.call(f, list(preds = preds[1:50],
                                           labels = labels[1:5])), collapse = ", "), "  \n")
}

Benchmarking Vector to Matrix to Vector

For a 10-class vector of 100,000 observations:

  • Vector A of length=(100000 * 10)
  • Vector B of length=(100000) with 10 classes
A = [1:1, 1:2, 1:3, 1:4... 1:10, 2:1, 2:2, 2:3..., 100000:8, 100000:9, 100000:10]
B = [3, 5, 9, 1, 4, 8, 6, ...]

Get the following Vector C:

C = [1:4, 2:6, 3:10, 4:2, 5:5, 6:9, 7:7, ...]

Initialize data

# Generate random data
set.seed(11111)
data <- runif(10000000, 0, 1)
labels <- round(runif(1000000, 0, 9), digits = 0)
# Background truth example
array(data[1:50], dim = c(10, 5))
           [,1]       [,2]       [,3]       [,4]       [,5]
 [1,] 0.5014483 0.57219649 0.70236924 0.06440384 0.78924978
 [2,] 0.9702328 0.34292525 0.50889166 0.65780657 0.72449914
 [3,] 0.7876004 0.09627503 0.20268701 0.19973930 0.00886554
 [4,] 0.9022259 0.74235690 0.90706612 0.24664551 0.57348710
 [5,] 0.8141778 0.42274539 0.05441064 0.25997440 0.73034459
 [6,] 0.7998922 0.98402494 0.09045308 0.14869691 0.42458612
 [7,] 0.1158690 0.89258437 0.85759800 0.21375685 0.07684022
 [8,] 0.7171363 0.59541565 0.94780036 0.20598170 0.53318628
 [9,] 0.1020639 0.80531234 0.16479666 0.02075780 0.28521238
[10,] 0.6856938 0.42102379 0.42536097 0.63880218 0.86408083
labels[1:5]
[1] 5 1 1 7 8
data[c(1 + labels[1], 11 + labels[2], 21 + labels[3], 31 + labels[4], 41 + labels[5])]
[1] 0.7998922 0.3429253 0.5088917 0.2059817 0.2852124
# How many digits for benchmarking in milliseconds
my_digits <- 6L
# How many runs for benchmarking?
my_runs <- 1000L

Benchmarks

# ===== BLOCK 1 =====
faster1 <- function(preds, labels) {
  data_len <- length(preds)
  label_len <- length(labels)
  len_div <- data_len / label_len
  dims <- c(len_div, label_len)
  label_long <- labels + 1
  mat_out <- cbind(label_long, 1:label_len)
  vec_out <- array(preds, dim = dims)[mat_out]
  return(vec_out)
}
test_case(faster1, preds = data, labels = labels)

Test case: 0.799892218317837, 0.342925252625719, 0.508891655597836, 0.205981696723029, 0.285212381277233

data1 <- print_well(microbenchmark(faster1(data, labels), times = my_runs)$time, digits = my_digits)
Min 25% 50% 75% Max Mean
64.312658 76.900123 82.223366 91.294758 187.028369 94.528352
# ===== BLOCK 2 =====
faster2 <- function(preds, labels) {
  data_len <- length(preds)
  label_len <- length(labels)
  vec_out <- array(preds, dim = c(data_len / label_len, label_len))[cbind(labels + 1, 1:label_len)]
  return(vec_out)
}
test_case(faster2, preds = data, labels = labels)

Test case: 0.799892218317837, 0.342925252625719, 0.508891655597836, 0.205981696723029, 0.285212381277233

data2 <- print_well(microbenchmark(faster2(data, labels), times = my_runs)$time, digits = my_digits)
Min 25% 50% 75% Max Mean
75.522127 77.650702 79.183414 83.235100 164.776690 85.804421
# ===== BLOCK 3 =====
faster3 <- function(preds, labels) {
  return(array(preds, dim = c(length(preds) / length(labels), length(labels)))[cbind(labels + 1, 1:length(labels))])
}
test_case(faster3, preds = data, labels = labels)

Test case: 0.799892218317837, 0.342925252625719, 0.508891655597836, 0.205981696723029, 0.285212381277233

data3 <- print_well(microbenchmark(faster3(data, labels), times = my_runs)$time, digits = my_digits)
Min 25% 50% 75% Max Mean
65.931280 77.665623 79.225228 82.958931 160.472404 82.804429
# ===== BLOCK 4 =====
faster4 <- function(preds, labels) {
  return(array(preds, dim = c(length(preds) / length(labels), length(labels)))[matrix(c(labels + 1, 1:length(labels)), ncol = 2)])
}
test_case(faster4, preds = data, labels = labels)

Test case: 0.799892218317837, 0.342925252625719, 0.508891655597836, 0.205981696723029, 0.285212381277233

data4 <- print_well(microbenchmark(faster4(data, labels), times = my_runs)$time, digits = my_digits)
Min 25% 50% 75% Max Mean
74.609039 87.599544 89.413651 93.436732 172.448608 93.591531
# ===== BLOCK 5 =====
faster5 <- function(preds, labels) {
  return(preds[((0:(length(labels) - 1)) * (length(preds) / length(labels))) + labels + 1])
}
test_case(faster5, preds = data, labels = labels)

Test case: 0.799892218317837, 0.342925252625719, 0.508891655597836, 0.205981696723029, 0.285212381277233

data5 <- print_well(microbenchmark(faster5(data, labels), times = my_runs)$time, digits = my_digits)
Min 25% 50% 75% Max Mean
23.954310 24.815225 25.280228 29.532910 118.538770 28.823157
# ===== BLOCK 6 =====
cppFunction("NumericVector faster6(NumericVector preds, NumericVector labels) {
  int labels_size = labels.size();
  NumericVector selected(labels_size);
  selected = (preds.size() / labels_size) * seq(0, labels_size - 1);
  selected = selected + labels;
  NumericVector to_return(labels_size);
  to_return = preds[selected];
  return to_return;
}")
test_case(faster6, preds = data, labels = labels)

Test case: 0.799892218317837, 0.342925252625719, 0.508891655597836, 0.205981696723029, 0.285212381277233

data6 <- print_well(microbenchmark(faster6(data, labels), times = my_runs)$time, digits = my_digits)
Min 25% 50% 75% Max Mean
21.357217 22.161396 22.644360 27.302364 106.357671 26.335169

Summary Results

data_time <- data.table(rbindlist(list(data.frame(Time = data1, Bench = "faster1"),
                                       data.frame(Time = data2, Bench = "faster2"),
                                       data.frame(Time = data3, Bench = "faster3"),
                                       data.frame(Time = data4, Bench = "faster4"),
                                       data.frame(Time = data5, Bench = "faster5"),
                                       data.frame(Time = data6, Bench = "faster6"))))
data_time <- data_time[, t_mean := mean(Time), by = Bench]
data_time <- data_time[, t_median := median(Time), by = Bench]
data_time$Benchs <- data_time$Bench 
levels(data_time$Benchs) <- paste0("faster", 1:6, "= [", sprintf(paste0("%.0", my_digits, "f"), data_time[, list(min(Time)), by = Bench]$V1), ", ", sprintf(paste0("%.0", my_digits, "f"), data_time[, list(max(Time)), by = Bench]$V1), "], mean=", sprintf(paste0("%.0", my_digits, "f"), data_time[, list(mean(Time)), by = Bench]$V1), ", median=", sprintf(paste0("%.0", my_digits, "f"), data_time[, list(median(Time)), by = Bench]$V1))
my_time <- data_time[, list(min(Time), quantile(Time, probs = 0.25), median(Time), quantile(Time, probs = 0.75), max(Time), mean(Time)), by = Bench]
colnames(my_time) <- c("Function", "Min", "25%", "50%", "75%", "Max", "Mean")
my_time <- my_time[order(Mean, decreasing = FALSE), ]
print(my_time, digits = 6)

Plot Results

ggplotly(ggplot(data = data_time, aes(x = Time)) + geom_histogram(aes(y = ..density..), bins = 20, color = "darkblue", fill = "lightblue") + facet_wrap(~ Benchs, ncol = 2) + geom_vline(aes(xintercept = t_mean), color = "blue", linetype = "dashed", size = 2) + geom_vline(aes(xintercept = t_median), color = "red", linetype = "dashed", size = 2) + labs(x = "Time (Milliseconds)", y = "Density") + theme_bw())
ggplotly(ggplot(data = data_time[, .(Time, Bench)], aes(x = Time, y = ..count.., fill = Bench)) + geom_histogram(aes(y = ..density..), bins = 100, position = "fill") + labs(x = "Time (Milliseconds)", y = "Density") + theme_bw())
ggplotly(ggplot(data = data_time, aes(x = Time, y = ..count.., fill = Bench)) + geom_density(position = "fill") + labs(x = "Time (Milliseconds)", y = "Density") + theme_bw())
---
title: "Benchmarks: vect2mat2vect"
output:
  html_notebook:
    collapsed: no
    theme: united
    toc: yes
    toc_depth: 6
    toc_float: yes
---

This notebook is all about benchmarking some R code used in this package.

Hardware / Software used:

* Intel i7-4600U
* Compilation flags for C/C++: `-O2 -Wall $(DEBUGFLAG) -mtune=core2` (R's defaults)
* Windows Server 2012 R2
* R 3.3.2 + Intel MKL

# Libraries

```{r init}
library(data.table)
library(microbenchmark)
library(Rcpp)
library(ggplot2)
library(plotly)
```

```{r based}

# Helper function to print data well in tables
print_well <- function(data, digits = 6) {
  
  # To milliseconds
  data <- data / 1000000
  
  # Sprintf helper
  sprintf_helper <- paste0("%.0", digits, "f")
  
  cat("| Min | 25% | 50% | 75% | Max | Mean |  \n| --: | --: | --: | --: | --: | --: |  \n| ", sprintf(sprintf_helper, min(data)), " | ", sprintf(sprintf_helper, quantile(data, probs = 0.25)), " | ", sprintf(sprintf_helper, median(data)), " | ", sprintf(sprintf_helper, quantile(data, probs = 0.75)), " | ", sprintf(sprintf_helper, max(data)), " | ", sprintf(sprintf_helper, mean(data)), " |  \n", sep = "")
  
  return(data)
  
}

# Test case function
# Arguments renamed to avoid recursive clash
test_case <- function(f, preds, labels, eps) {
  cat("Test case: ", paste(do.call(f, list(preds = preds[1:50],
                                           labels = labels[1:5])), collapse = ", "), "  \n")
}

```

# Benchmarking Vector to Matrix to Vector

For a 10-class vector of 100,000 observations:

* Vector A of length=(100000 * 10)
* Vector B of length=(100000) with 10 classes

```
A = [1:1, 1:2, 1:3, 1:4... 1:10, 2:1, 2:2, 2:3..., 100000:8, 100000:9, 100000:10]
B = [3, 5, 9, 1, 4, 8, 6, ...]
```

Get the following Vector C:

```
C = [1:4, 2:6, 3:10, 4:2, 5:5, 6:9, 7:7, ...]
```

# Initialize data

```{r bench1}
# Generate random data
set.seed(11111)
data <- runif(10000000, 0, 1)
labels <- round(runif(1000000, 0, 9), digits = 0)

# Background truth example
array(data[1:50], dim = c(10, 5))
labels[1:5]
data[c(1 + labels[1], 11 + labels[2], 21 + labels[3], 31 + labels[4], 41 + labels[5])]

# How many digits for benchmarking in milliseconds
my_digits <- 6L

# How many runs for benchmarking?
my_runs <- 1000L
```

# Benchmarks

```{r bench2, results="asis"}

# ===== BLOCK 1 =====
faster1 <- function(preds, labels) {
  data_len <- length(preds)
  label_len <- length(labels)
  len_div <- data_len / label_len
  dims <- c(len_div, label_len)
  label_long <- labels + 1
  mat_out <- cbind(label_long, 1:label_len)
  vec_out <- array(preds, dim = dims)[mat_out]
  return(vec_out)
}
test_case(faster1, preds = data, labels = labels)
data1 <- print_well(microbenchmark(faster1(data, labels), times = my_runs)$time, digits = my_digits)

# ===== BLOCK 2 =====
faster2 <- function(preds, labels) {
  data_len <- length(preds)
  label_len <- length(labels)
  vec_out <- array(preds, dim = c(data_len / label_len, label_len))[cbind(labels + 1, 1:label_len)]
  return(vec_out)
}
test_case(faster2, preds = data, labels = labels)
data2 <- print_well(microbenchmark(faster2(data, labels), times = my_runs)$time, digits = my_digits)

# ===== BLOCK 3 =====
faster3 <- function(preds, labels) {
  return(array(preds, dim = c(length(preds) / length(labels), length(labels)))[cbind(labels + 1, 1:length(labels))])
}
test_case(faster3, preds = data, labels = labels)
data3 <- print_well(microbenchmark(faster3(data, labels), times = my_runs)$time, digits = my_digits)

# ===== BLOCK 4 =====
faster4 <- function(preds, labels) {
  return(array(preds, dim = c(length(preds) / length(labels), length(labels)))[matrix(c(labels + 1, 1:length(labels)), ncol = 2)])
}
test_case(faster4, preds = data, labels = labels)
data4 <- print_well(microbenchmark(faster4(data, labels), times = my_runs)$time, digits = my_digits)

# ===== BLOCK 5 =====
faster5 <- function(preds, labels) {
  return(preds[((0:(length(labels) - 1)) * (length(preds) / length(labels))) + labels + 1])
}
test_case(faster5, preds = data, labels = labels)
data5 <- print_well(microbenchmark(faster5(data, labels), times = my_runs)$time, digits = my_digits)

# ===== BLOCK 6 =====
cppFunction("NumericVector faster6(NumericVector preds, NumericVector labels) {
  int labels_size = labels.size();
  NumericVector selected(labels_size);
  selected = (preds.size() / labels_size) * seq(0, labels_size - 1);
  selected = selected + labels;
  NumericVector to_return(labels_size);
  to_return = preds[selected];
  return to_return;
}")
test_case(faster6, preds = data, labels = labels)
data6 <- print_well(microbenchmark(faster6(data, labels), times = my_runs)$time, digits = my_digits)

```

# Summary Results

```{r bench3}

data_time <- data.table(rbindlist(list(data.frame(Time = data1, Bench = "faster1"),
                                       data.frame(Time = data2, Bench = "faster2"),
                                       data.frame(Time = data3, Bench = "faster3"),
                                       data.frame(Time = data4, Bench = "faster4"),
                                       data.frame(Time = data5, Bench = "faster5"),
                                       data.frame(Time = data6, Bench = "faster6"))))
data_time <- data_time[, t_mean := mean(Time), by = Bench]
data_time <- data_time[, t_median := median(Time), by = Bench]
data_time$Benchs <- data_time$Bench 
levels(data_time$Benchs) <- paste0("faster", 1:6, "= [", sprintf(paste0("%.0", my_digits, "f"), data_time[, list(min(Time)), by = Bench]$V1), ", ", sprintf(paste0("%.0", my_digits, "f"), data_time[, list(max(Time)), by = Bench]$V1), "], mean=", sprintf(paste0("%.0", my_digits, "f"), data_time[, list(mean(Time)), by = Bench]$V1), ", median=", sprintf(paste0("%.0", my_digits, "f"), data_time[, list(median(Time)), by = Bench]$V1))

my_time <- data_time[, list(min(Time), quantile(Time, probs = 0.25), median(Time), quantile(Time, probs = 0.75), max(Time), mean(Time)), by = Bench]
colnames(my_time) <- c("Function", "Min", "25%", "50%", "75%", "Max", "Mean")
my_time <- my_time[order(Mean, decreasing = FALSE), ]
print(my_time, digits = 6)

```

# Plot Results

```{r bench4, fig.height=6, fig.width=10}

ggplotly(ggplot(data = data_time, aes(x = Time)) + geom_histogram(aes(y = ..density..), bins = 20, color = "darkblue", fill = "lightblue") + facet_wrap(~ Benchs, ncol = 2) + geom_vline(aes(xintercept = t_mean), color = "blue", linetype = "dashed", size = 2) + geom_vline(aes(xintercept = t_median), color = "red", linetype = "dashed", size = 2) + labs(x = "Time (Milliseconds)", y = "Density") + theme_bw())

```

```{r bench5, fig.height=6, fig.width=10}
ggplotly(ggplot(data = data_time[, .(Time, Bench)], aes(x = Time, y = ..count.., fill = Bench)) + geom_histogram(aes(y = ..density..), bins = 100, position = "fill") + labs(x = "Time (Milliseconds)", y = "Density") + theme_bw())
```

```{r bench6, fig.height=6, fig.width=10}
ggplotly(ggplot(data = data_time, aes(x = Time, y = ..count.., fill = Bench)) + geom_density(position = "fill") + labs(x = "Time (Milliseconds)", y = "Density") + theme_bw())
```